Conversation
            LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
        }
    } break;
case PROJECTOR_TYPE_UTUVL:
instead of duplicating the code block, add it to the same case above:
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_UTUVL:
delete this file, reuse qwen2vl.cpp
utu differs from qwen2.5 in several aspects. It's difficult to merge them together
can you list these differences?
- change conv3d to linear
- const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; changed to
  const bool full_attn = (il + 1) % 8 == 0 || il == n_layer - 1; (added il == n_layer - 1, where n_layer = 27)
- delete ff_gate_w in build_ffn
- exchange merge and window attention
ok thanks. please also leave this list in the code comment
tools/mtmd/models/utuvl.cpp
Outdated
    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        const bool full_attn = (il + 1) % 8 == 0 || il == n_layer - 1;
what's the number 8 here? can it be a hparam?
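For reference, the hard-coded rule can be spelled out as the set of layers it selects, which is what a dedicated hparam (or a stored layer list) would carry instead of the literal 8. A minimal sketch, assuming n_layer = 27 as stated in the reply above:

```python
# sketch: the hard-coded rule expressed as an explicit layer set
n_layer = 27  # value from the author's reply above
full_attn_layers = [il for il in range(n_layer) if (il + 1) % 8 == 0 or il == n_layer - 1]
print(full_attn_layers)  # [7, 15, 23, 26] -- the period 8 (or this list) could live in GGUF metadata
```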
tools/mtmd/clip.cpp
Outdated
    if (use_window_attn) {
-       const int attn_window_size = 112;
+       const int attn_window_size = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? 112 : patch_size * 2 * 8;
extract these numbers into a new hparam instead:
hparams.attn_window_size
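One possible shape for that, sketched from the conversion side (a sketch only; the metadata key name is an assumption taken from the review comment, not an existing key):

```python
# sketch: compute the window size once during conversion and store it as metadata,
# so clip.cpp can read it into hparams.attn_window_size instead of hard-coding 112 / patch_size * 2 * 8
def compute_attn_window_size(patch_size: int, is_qwen25vl: bool) -> int:
    # mirrors the expression under review
    return 112 if is_qwen25vl else patch_size * 2 * 8

# in the model's set_gguf_parameters (hypothetical key name):
# self.gguf_writer.add_uint32("clip.vision.attn_window_size", compute_attn_window_size(patch_size, False))
```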
please also fix any failed checks regarding code formatting
convert_hf_to_gguf.py
Outdated
        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
            res = "utu-vl"
Do not manually add these, they are generated by convert_hf_to_gguf_update.py. Edit that and run it to get the correct entry here!
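For context, the update script keeps a list of tokenizer sources and regenerates these chkhsh branches from it; the addition would look roughly like the sketch below (the repo URL is a placeholder and the tokenizer type is an assumption):

```python
# sketch of a models-list entry for convert_hf_to_gguf_update.py (values are illustrative)
{"name": "utu-vl", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/<org>/<utu-vl-model>"},
```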
convert_hf_to_gguf.py
Outdated
| if hparams.get("moe_intermediate_size") is not None: | ||
| self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) | ||
| else: | ||
| self.gguf_writer.add_expert_feed_forward_length(hparams.get("intermediate_size", 0)) | ||
|
|
||
| if hparams.get("n_routed_experts") is not None: | ||
| self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) | ||
|
|
||
| if hparams.get("n_shared_experts") is not None: | ||
| self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) | ||
| else: | ||
| self.gguf_writer.add_expert_shared_count(0) | ||
|
|
||
| if hparams.get("routed_scaling_factor") is not None: | ||
| self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) | ||
| else: | ||
| self.gguf_writer.add_expert_weights_scale(1.0) | ||
|
|
||
| if hparams.get("norm_topk_prob") is not None and hparams["norm_topk_prob"]: | ||
| self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) |
Don't fetch the same value in both the condition and the call; use a walrus assignment in the condition, as seen elsewhere.
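A minimal sketch of that walrus form, with the same logic as the quoted lines:

```python
# sketch: fetch norm_topk_prob once and reuse it
if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
    self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
```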
        # skip lm_head.weight if tie_word_embeddings is True
        if self.hparams.get("tie_word_embeddings", False):
            # Save token_embd for potential duplication as output if tie_word_embeddings is True
            if name == "model.embed_tokens.weight":
                self._token_embd = data_torch
            if name == "lm_head.weight" or name == "model.lm_head.weight":
                logger.info("Skipping tied output layer 'lm_head.weight' - will duplicate from token_embd.weight")
                return []
Don't do this on conversion, do it on model load like every other model.
    def add_vision_n_wa_pattern(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)

    def add_vision_wa_layers(self, layers: Sequence[int]) -> None:
        self.add_array(Keys.ClipVision.WA_LAYERS, layers)
revert this change, add a dedicated metadata key instead
gguf-py/gguf/constants.py
Outdated
    USE_GELU     = "clip.use_gelu"
    USE_SILU     = "clip.use_silu"
    N_WA_PATTERN = "clip.vision.n_wa_pattern"  # used by qwen2.5vl
    WA_LAYERS    = "clip.vision.wa_layers"     # used by qwen2.5vl and utuvl
revert this change, add a dedicated metadata key instead
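i.e. leave the qwen2.5vl key and its comment untouched and introduce a separate key for the new model. A sketch, with illustrative (not agreed-upon) names:

```python
# constants.py sketch: keep the existing qwen2.5vl key as-is, add a dedicated one for utuvl
N_WA_PATTERN = "clip.vision.n_wa_pattern"        # existing, used by qwen2.5vl (unchanged)
UTUVL_WA_LAYERS = "clip.vision.utuvl.wa_layers"  # hypothetical dedicated key

# gguf_writer.py sketch: a dedicated writer method rather than repurposing the qwen one
# def add_vision_utuvl_wa_layers(self, layers: Sequence[int]) -> None:
#     self.add_array(Keys.ClipVision.UTUVL_WA_LAYERS, layers)
```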
convert_hf_to_gguf.py
Outdated
            # save window attention layers (full attention block indexes)
            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
            n_wa_pattern = fullatt_block_indexes[0] + 1
            # validate n_wa_pattern
            for i in range(1, len(fullatt_block_indexes)):
                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
            self.gguf_writer.add_vision_wa_layers(fullatt_block_indexes)
revert any changes to qwen code
tools/mtmd/clip.cpp
Outdated
            hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
            const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
            if (hparams.image_min_pixels < warn_min_pixels) {
                LOG_WRN("%s: Youtu-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
is this warning true, or just blindly copy-pasted?
I just updated the code.
tools/mtmd/clip.cpp
Outdated
            return std::max(align_size, aligned);
        };

        // Binary search with 0.02 step size
is this a binary or linear search? where is the binary part?
sorry, I just fixed the comment
and changes to minja must be done in the upstream project, not here
| for (auto & layer : wa_layers_vec) { | ||
| hparams.wa_layers.insert(layer); | ||
| } | ||
| hparams.set_limit_image_tokens(1, 62500); |
are you sure your model actually supports 62500 tokens per image? how did you calculate it?
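For scale, a back-of-the-envelope check of what that limit implies (the patch and merge sizes below are assumptions for illustration, not values taken from the model config):

```python
# sketch: rough pixel budget implied by a 62500-image-token limit
patch_size, n_merge = 14, 2                       # assumed values, for illustration only
max_tokens = 62500
max_pixels = max_tokens * (patch_size * n_merge) ** 2
print(max_pixels)                                 # 49,000,000 px, roughly a 7000 x 7000 image
```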
            const int warn_min_pixels = 1 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
            if (hparams.image_min_pixels < warn_min_pixels) {
                LOG_WRN("%s: Youtu-VL models require at minimum 1 image tokens to function correctly on grounding tasks\n", __func__);
I don't see the point of this check. it's redundant. delete it
@ngxson The project associated with this pull request was previously deleted. I pushed the local offline project to the master branch, but I can't reopen the pull request. You can check the differences in the new commits.
Support for the large youtu-vl model, which will be open-sourced soon.